package eu.socialsensor.twcollect; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.jsoup.Connection; import org.jsoup.HttpStatusException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; /** * This is a wrapper class that downloads the HTML page for a single tweet * and extracts the fields of interest (with the help of jsoup). * @author kleinmind * */ public class TweetFieldsFetcher { // constants private static final String TWITTER = "http://twitter.com/"; private static final String DEFAULT_TWITTER_STATUS_ROOT = "username/status/"; public TweetFieldsResponse fetchTweetFields(String tweetId){ // generate twitter URL (this one creates a redirect!) String twitterURL = TWITTER + DEFAULT_TWITTER_STATUS_ROOT + tweetId; // keep track of time long time0 = System.currentTimeMillis(); Connection.Response response = null; try { // this is where the download takes place response = Jsoup.connect(twitterURL).execute(); } catch (HttpStatusException e) { System.err.println(twitterURL); return createResponse(new TweetFields(tweetId, null, null, null), e.getStatusCode(), time0, false); } catch (IOException e) { e.printStackTrace(); return createOtherErrorResponse(tweetId, time0); } // parsing the returned HTML code Document doc = null; try { doc = response.parse(); } catch (IOException e) { e.printStackTrace(); return createParseErrorResponse(tweetId, time0); } // extract parsed id String resolvedUrl = response.url().getPath(); int lastSlashIdx = resolvedUrl.lastIndexOf('/'); String originalId = resolvedUrl.substring(lastSlashIdx+1); // extract username String username = doc.select(".user-actions").attr("data-screen-name"); // extract text Elements textEl = doc.select(".js-tweet-text"); // this is not the most appropriate way to check if (textEl == null || textEl.first() == null){ // is the user suspension the only reason for getting a null text? System.err.println(twitterURL + " (suspeneded)"); return createResponse(new TweetFields(tweetId, null, null, null), response.statusCode(), time0, true); } String text = textEl.first().text(); // should always be there, but not correct for response tweets String publicationTime = doc.select(".tweet-timestamp").attr("title"); Elements mainEl = doc.select(".permalink-tweet"); if (mainEl != null && mainEl.first() != null){ Elements inEl = mainEl.first().select(".js-tweet-text"); if (inEl != null && inEl.first() != null){ text = inEl.first().text(); } // get correct publication time publicationTime = mainEl.select(".metadata").first().text(); } // get tweets to which this tweet replies (if available) Elements repEl = doc.select(".permalink-in-reply-tos"); String[] responseTos = null; if (repEl != null && repEl.first() != null){ List<String> respIds = new ArrayList<String>(); Elements inEl = repEl.first().select(".simple-tweet"); if (inEl != null){ for (int i = 0; i < inEl.size(); i++){ respIds.add(inEl.get(i).attr("data-tweet-id")); } } responseTos = new String[respIds.size()]; for (int i = 0; i < respIds.size(); i++){ responseTos[i] = respIds.get(i); } } // retweets (if available) Elements numRetweetsEl = doc.select(".stats .js-stat-retweets a strong"); int numRetweets = 0; if (numRetweetsEl.text().length()>0) { numRetweets = Integer.parseInt(numRetweetsEl.text().replaceAll(String.valueOf((char)160),"").replaceAll(",","")); } // favorites (if available) Elements numFavoritesEl = doc.select(".stats .js-stat-favorites a strong"); int numFavorites = 0; if (numFavoritesEl.text().length()>0){ numFavorites = Integer.parseInt(numFavoritesEl.text().replaceAll(String.valueOf((char)160),"").replaceAll(",","")); } TweetFields tweetFields = null; if (tweetId.equals(originalId)) { // original tweet tweetFields = new TweetFields(tweetId, username, text, publicationTime, numRetweets, numFavorites, null, responseTos); } else { tweetFields = new TweetFields(tweetId, username, text, publicationTime, numRetweets, numFavorites, originalId, null); } return createResponse(tweetFields, response.statusCode(), time0, false); } private TweetFieldsResponse createResponse(TweetFields tweetFields, int status, long time0, boolean suspended){ return new TweetFieldsResponse(tweetFields, status, (int)(System.currentTimeMillis() - time0), suspended, false, false); } private TweetFieldsResponse createParseErrorResponse(String tweetId, long time0){ return new TweetFieldsResponse(new TweetFields(tweetId, null, null, null), -1, (int)(System.currentTimeMillis() - time0), false, true, false); } private TweetFieldsResponse createOtherErrorResponse(String tweetId, long time0){ return new TweetFieldsResponse(new TweetFields(tweetId, null, null, null), -1, (int)(System.currentTimeMillis() - time0), false, false, true); } }